Call Libraries
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.2 ──✔ ggplot2 3.3.6 ✔ purrr 1.0.1
✔ tibble 3.1.8 ✔ dplyr 1.0.10
✔ tidyr 1.2.1 ✔ stringr 1.4.1
✔ readr 2.1.3 ✔ forcats 0.5.2 ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag() masks stats::lag()
library(car)
Loading required package: carData
Attaching package: ‘car’
The following object is masked from ‘package:dplyr’:
recode
The following object is masked from ‘package:purrr’:
some
library(moments)
library(glmnet)
Loading required package: Matrix
Attaching package: ‘Matrix’
The following objects are masked from ‘package:tidyr’:
expand, pack, unpack
Loaded glmnet 4.1-6
Calling the Transformed Datasets
income_cleaned = read_csv('Shiny_app/data/income_cleaned.csv')
Rows: 1921 Columns: 5── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (3): Year, Num, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
industry_cleaned = read_csv('Shiny_app/data/industry_cleaned.csv')
Rows: 2476 Columns: 5── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (2): Name, Group
dbl (3): Year, Num, Avg
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Creating the Models
sat.model.summary <- function (df, field, sat.formula){
#Shapiro-Wilks test to evaluate normality
print(shapiro.test(df[[field]]))
#Kurtosis evaluation (normal distribution has a value close to 3)
print('kurtosis')
print(kurtosis(df[[field]]))
linear.model.cleaned = lm(sat.formula, data = df)
print(summary(linear.model.cleaned))
plot(linear.model.cleaned)
#histograms of response variable to check distribution
print(df %>%
ggplot(aes_string(field)) +
geom_histogram() +
labs(title = 'Average Credit Amount Distribution') +
theme(plot.title = element_text(hjust = 0.5)))
#Checking multicollinearity using VIF measurement
print(vif(linear.model.cleaned))
influencePlot(linear.model.cleaned)
#avPlots(linear.model.cleaned)
}
sat.formula <- Avg ~ .
sat.field <- 'Avg'
sat.model.summary(income_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.17297, p-value < 2.2e-16
[1] "kurtosis"
[1] 172.7518
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11338570 -547702 94139 421302 87181761
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.157e+08 5.512e+07 -2.100 0.03590 *
Year 5.721e+04 2.731e+04 2.095 0.03634 *
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -3.287e+05 1.242e+06 -0.265 0.79140
NameAlternative Minimum Tax Credit 6.538e+05 9.757e+05 0.670 0.50286
NameBeer Production Credit 3.316e+05 1.290e+06 0.257 0.79715
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.429e+06 9.960e+05 1.434 0.15162
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.098e+06 1.421e+06 2.883 0.00398 **
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.182e+06 9.927e+05 1.191 0.23386
NameBrownfield Tax Credits - Remediation Real Property Tax Credit -7.587e+04 9.920e+05 -0.076 0.93904
NameClean Heating Fuel Credit 7.148e+04 1.024e+06 0.070 0.94436
NameConservation Easement Tax Credit 1.143e+05 1.064e+06 0.107 0.91444
NameCredit for Employment of Persons with Disabilities -9.343e+05 1.119e+06 -0.835 0.40391
NameCredit for Purchase of an Automated External Defibrillator -1.449e+05 9.695e+05 -0.150 0.88117
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.316e+05 1.779e+06 0.130 0.89645
NameEmpire State Apprentice Tax Credit -7.983e+05 1.524e+06 -0.524 0.60040
NameEmpire State Commercial Production Credit 2.183e+05 1.291e+06 0.169 0.86576
NameEmpire State Film Post Production Credit 2.521e+04 1.049e+06 0.024 0.98082
NameEmpire State Film Production Credit 1.145e+07 9.967e+05 11.485 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit -2.342e+05 1.775e+06 -0.132 0.89507
NameExcelsior Jobs Program Credit 1.035e+05 9.545e+05 0.108 0.91366
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 2.752e+06 8.862e+05 3.105 0.00193 **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 4.845e+05 9.213e+05 0.526 0.59899
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.245e+06 9.239e+05 1.348 0.17785
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners -1.025e+05 9.549e+05 -0.107 0.91455
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -6.145e+04 9.387e+05 -0.065 0.94781
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 9.119e+04 1.044e+06 0.087 0.93038
NameFarm Workforce Retention Credit -7.802e+03 1.285e+06 -0.006 0.99516
NameFarmers' School Tax Credit 1.440e+05 1.021e+06 0.141 0.88786
NameHire a Veteran Credit -1.238e+06 1.782e+06 -0.695 0.48721
NameHistoric Properties Rehabilitation Credit 1.352e+06 1.055e+06 1.282 0.20015
NameIndustrial or Manufacturing Business Tax Credit 4.997e+05 1.074e+06 0.465 0.64173
NameInvestment Tax Credit 4.463e+05 8.895e+05 0.502 0.61588
NameInvestment Tax Credit for the Financial Services Industry 3.976e+05 1.008e+06 0.395 0.69325
NameLife Sciences Research & Development Tax Credit -8.207e+04 2.099e+06 -0.039 0.96882
NameLong-Term Care Insurance Credit 1.246e+05 9.808e+05 0.127 0.89892
NameLow-Income Housing Credit -2.764e+04 1.113e+06 -0.025 0.98020
NameMinimum Wage Reimbursement Credit -2.931e+05 1.006e+06 -0.291 0.77075
NameMortgage Servicing Tax Credit -4.540e+05 1.085e+06 -0.418 0.67565
NameNew York Youth Jobs Program Tax Credit -2.587e+05 9.381e+05 -0.276 0.78279
NameQETC Capital Tax Credit 2.616e+05 1.386e+06 0.189 0.85032
NameQETC Employment Credit 1.335e+05 1.026e+06 0.130 0.89641
NameQETC Facilities, Operations, and Training Credit 5.153e+05 1.918e+06 0.269 0.78820
NameReal Property Tax Relief Credit for Manufacturing -2.635e+05 9.654e+05 -0.273 0.78490
NameSpecial Additional Mortgage Recording Tax Credit 3.678e+04 9.244e+05 0.040 0.96827
NameSTART-UP NY Tax Elimination Credit 4.061e+03 1.130e+06 0.004 0.99713
Group1,000,000 - 24,999,999 2.170e+05 3.501e+05 0.620 0.53548
Group100,000 - 499,999 1.352e+05 3.579e+05 0.378 0.70572
Group100,000,000 - 499,999,999 9.644e+05 3.937e+05 2.449 0.01441 *
Group25,000,000 - 49,999,999 4.519e+05 4.223e+05 1.070 0.28474
Group50,000,000 - 99,999,999 4.021e+05 4.235e+05 0.950 0.34247
Group500,000 - 999,999 2.195e+05 3.880e+05 0.566 0.57170
Group500,000,000 - and over 3.073e+06 3.846e+05 7.991 2.33e-15 ***
GroupZero or Net Loss 9.536e+05 3.339e+05 2.856 0.00433 **
Num -4.520e+02 8.569e+02 -0.527 0.59791
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3868000 on 1867 degrees of freedom
Multiple R-squared: 0.2361, Adjusted R-squared: 0.2144
F-statistic: 10.88 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
income.model <- lm(sat.formula, data = income_cleaned)
sat.model.summary(industry_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.22287, p-value < 2.2e-16
[1] "kurtosis"
[1] 138.5482
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11581741 -371031 -23164 154182 28917959
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.926e+07 2.051e+07 -1.914 0.055677 .
Year 1.936e+04 1.016e+04 1.905 0.056959 .
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 5.584e+04 5.062e+05 0.110 0.912180
NameAlternative Minimum Tax Credit 3.523e+05 4.208e+05 0.837 0.402556
NameBeer Production Credit 8.799e+04 6.261e+05 0.141 0.888249
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.919e+06 4.473e+05 4.290 1.86e-05 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.064e+06 7.278e+05 5.584 2.62e-08 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.054e+06 4.779e+05 2.206 0.027462 *
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 1.327e+05 4.760e+05 0.279 0.780429
NameClean Heating Fuel Credit 2.785e+05 4.595e+05 0.606 0.544549
NameConservation Easement Tax Credit 2.743e+04 4.700e+05 0.058 0.953457
NameCredit for Employment of Persons with Disabilities 1.574e+05 5.069e+05 0.311 0.756142
NameCredit for Purchase of an Automated External Defibrillator 1.082e+05 4.465e+05 0.242 0.808639
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.090e+05 8.286e+05 0.252 0.800866
NameEmpire State Apprentice Tax Credit -3.324e+05 1.012e+06 -0.329 0.742537
NameEmpire State Commercial Production Credit 3.524e+05 6.172e+05 0.571 0.568123
NameEmpire State Film Post Production Credit 5.332e+05 5.224e+05 1.021 0.307585
NameEmpire State Film Production Credit 1.170e+07 4.785e+05 24.458 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 7.676e+04 9.010e+05 0.085 0.932112
NameExcelsior Jobs Program Credit 7.086e+05 4.393e+05 1.613 0.106849
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.381e+06 4.318e+05 3.199 0.001399 **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 3.715e+05 4.213e+05 0.882 0.377942
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.160e+06 4.192e+05 2.767 0.005700 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 3.634e+05 4.311e+05 0.843 0.399359
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 2.567e+05 4.265e+05 0.602 0.547350
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 7.379e+04 5.041e+05 0.146 0.883634
NameFarm Workforce Retention Credit 2.727e+04 5.352e+05 0.051 0.959361
NameFarmers' School Tax Credit 1.287e+05 5.133e+05 0.251 0.802102
NameHire a Veteran Credit 1.459e+05 8.253e+05 0.177 0.859709
NameHistoric Properties Rehabilitation Credit 1.875e+06 4.841e+05 3.874 0.000110 ***
NameInvestment Tax Credit 7.182e+05 4.129e+05 1.739 0.082075 .
NameInvestment Tax Credit for the Financial Services Industry 6.339e+05 5.757e+05 1.101 0.270913
NameLife Sciences Research & Development Tax Credit -2.243e+03 9.007e+05 -0.002 0.998014
NameLong-Term Care Insurance Credit 1.385e+05 4.200e+05 0.330 0.741537
NameLow-Income Housing Credit 1.642e+06 5.494e+05 2.988 0.002833 **
NameMinimum Wage Reimbursement Credit 9.624e+04 4.381e+05 0.220 0.826159
NameMortgage Servicing Tax Credit 2.077e+05 6.462e+05 0.321 0.747934
NameNew York Youth Jobs Program Tax Credit 1.953e+05 4.273e+05 0.457 0.647604
NameQETC Capital Tax Credit 2.986e+05 5.868e+05 0.509 0.610823
NameQETC Employment Credit 8.982e+04 4.465e+05 0.201 0.840607
NameQETC Facilities, Operations, and Training Credit 3.477e+05 6.711e+05 0.518 0.604441
NameReal Property Tax Relief Credit for Manufacturing 1.520e+05 4.423e+05 0.344 0.731171
NameSpecial Additional Mortgage Recording Tax Credit 2.609e+05 4.432e+05 0.589 0.556155
NameSTART-UP NY Tax Elimination Credit 1.703e+04 4.750e+05 0.036 0.971411
GroupAdministrative and Support and Waste Management and Remediation Services -2.280e+03 2.519e+05 -0.009 0.992777
GroupAdministrative/Support/Waste Management/Remediation Services -3.173e+03 2.798e+05 -0.011 0.990952
GroupAgriculture, Forestry, Fishing and Hunting 7.269e+04 2.330e+05 0.312 0.755097
GroupArts, Entertainment, and Recreation 5.866e+05 2.317e+05 2.532 0.011408 *
GroupConstruction -1.699e+04 2.171e+05 -0.078 0.937621
GroupEducational Services 5.569e+04 2.948e+05 0.189 0.850176
GroupFinance and Insurance 2.139e+05 2.034e+05 1.051 0.293237
GroupHealth Care and Social Assistance 1.300e+04 2.290e+05 0.057 0.954731
GroupInformation -2.433e+05 2.178e+05 -1.117 0.263917
GroupManagement of Companies and Enterprises 3.355e+05 1.949e+05 1.721 0.085332 .
GroupManufacturing 6.786e+05 2.028e+05 3.346 0.000831 ***
GroupMining -1.561e+04 3.593e+05 -0.043 0.965352
GroupMining, Quarrying, and Oil and Gas Extraction 1.084e+05 3.011e+05 0.360 0.718901
GroupOther Services (except Public Administration) -6.462e+04 2.217e+05 -0.291 0.770697
GroupProfessional, Scientific, and Technical Services 3.736e+05 2.085e+05 1.792 0.073214 .
GroupReal Estate and Rental and Leasing 9.506e+04 2.044e+05 0.465 0.641882
GroupRetail Trade 4.746e+04 2.036e+05 0.233 0.815756
GroupTransportation and Warehousing -1.866e+04 2.300e+05 -0.081 0.935321
GroupUtilities 5.308e+05 2.633e+05 2.016 0.043917 *
GroupWholesale Trade 6.106e+04 2.081e+05 0.293 0.769261
Num -9.657e+02 4.272e+02 -2.260 0.023889 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1612000 on 2411 degrees of freedom
Multiple R-squared: 0.4674, Adjusted R-squared: 0.4533
F-statistic: 33.06 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
industry.model <- lm(sat.formula, data = industry_cleaned)
Selecting Specific Diagnostic plots for linear models
plot(income.model, which = 1)
plot(income.model, which = 2)
plot(income.model, which = 3)
plot(income.model, which = 5)
Correcting violation of Normality in previous model with BoxCox transform
bc_func <- function (lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
#Extracting the best lambda value.
return(bc$x[which(bc$y == max(bc$y))])
}
#Income Group Dataset
income.lambda.bc = bc_func(income.model, seq(-0.2, 0.2, 1/10))
income.lambda.bc
[1] -0.01414141
#Industry Group Dataset
industry.lambda.bc = bc_func(industry.model, seq(-0.2, 0.2, 1/10))
industry.lambda.bc
[1] -0.03434343
lambda.bcs <- list('income' = income.lambda.bc, 'industry' = industry.lambda.bc)
saveRDS(lambda.bcs, 'Shiny_app/data/lambda.bcs.rds')
bc_transform <- function(df, lambda.bc){
return (df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg))) #took out field Amount
}
#Income Group Dataset
income_cleaned_bc <- bc_transform(income_cleaned, income.lambda.bc)
income.model.bc = lm(Avg.bc ~ ., data = income_cleaned_bc)
#Industry Group Dataset
industry_cleaned_bc <- bc_transform(industry_cleaned, industry.lambda.bc)
industry.model.bc = lm(Avg.bc ~ ., data = industry_cleaned_bc)
Testing out bc_func for migration to Shiny App global.R file
bc_funct <- function (df, lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
lambda.bc = bc$x[which(bc$y == max(bc$y))]
return(df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg)))
}
bc_funct(income_cleaned, income.model, seq(-0.2, 0.2, 1/10))
44+9+1+1
[1] 55
Checking linear regression assumptions for the transformed data.
sat.formula.bc <- Avg.bc ~ .
sat.field.bc <- 'Avg.bc'
#Income
sat.model.summary(income_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.99696, p-value = 0.000782
[1] "kurtosis"
[1] 2.744669
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.6597 -0.5738 -0.0113 0.5668 4.4826
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.634e+01 1.456e+01 -2.496 0.012643 *
Year 2.288e-02 7.214e-03 3.171 0.001543 **
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -8.643e-01 3.281e-01 -2.634 0.008510 **
NameAlternative Minimum Tax Credit -2.052e+00 2.577e-01 -7.962 2.91e-15 ***
NameBeer Production Credit 5.687e-01 3.407e-01 1.669 0.095234 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.715e+00 2.630e-01 6.521 8.99e-11 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.676e+00 3.754e-01 7.130 1.43e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.331e+00 2.622e-01 5.076 4.24e-07 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 5.458e-02 2.620e-01 0.208 0.834991
NameClean Heating Fuel Credit -3.086e+00 2.705e-01 -11.409 < 2e-16 ***
NameConservation Easement Tax Credit -2.137e+00 2.810e-01 -7.606 4.45e-14 ***
NameCredit for Employment of Persons with Disabilities -3.118e+00 2.956e-01 -10.547 < 2e-16 ***
NameCredit for Purchase of an Automated External Defibrillator -2.938e+00 2.560e-01 -11.473 < 2e-16 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -5.673e-01 4.698e-01 -1.207 0.227430
NameEmpire State Apprentice Tax Credit -2.207e+00 4.024e-01 -5.485 4.71e-08 ***
NameEmpire State Commercial Production Credit 6.075e-02 3.410e-01 0.178 0.858634
NameEmpire State Film Post Production Credit 1.101e+00 2.769e-01 3.977 7.25e-05 ***
NameEmpire State Film Production Credit 2.853e+00 2.632e-01 10.838 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 2.508e-01 4.688e-01 0.535 0.592803
NameExcelsior Jobs Program Credit 4.651e-01 2.521e-01 1.845 0.065178 .
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 8.876e-01 2.341e-01 3.792 0.000154 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 1.775e-01 2.433e-01 0.730 0.465701
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.220e+00 2.440e-01 5.000 6.26e-07 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 1.182e-01 2.522e-01 0.469 0.639260
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -8.497e-01 2.479e-01 -3.427 0.000623 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -1.358e+00 2.756e-01 -4.928 9.02e-07 ***
NameFarm Workforce Retention Credit -1.612e+00 3.394e-01 -4.749 2.20e-06 ***
NameFarmers' School Tax Credit -1.311e+00 2.696e-01 -4.863 1.26e-06 ***
NameHire a Veteran Credit -2.915e+00 4.706e-01 -6.195 7.15e-10 ***
NameHistoric Properties Rehabilitation Credit 1.918e+00 2.786e-01 6.886 7.81e-12 ***
NameIndustrial or Manufacturing Business Tax Credit -1.718e+00 2.836e-01 -6.059 1.66e-09 ***
NameInvestment Tax Credit 7.147e-02 2.349e-01 0.304 0.760985
NameInvestment Tax Credit for the Financial Services Industry 3.176e-01 2.662e-01 1.193 0.232870
NameLife Sciences Research & Development Tax Credit 7.033e-01 5.543e-01 1.269 0.204728
NameLong-Term Care Insurance Credit -2.863e+00 2.590e-01 -11.051 < 2e-16 ***
NameLow-Income Housing Credit -9.063e-01 2.940e-01 -3.083 0.002080 **
NameMinimum Wage Reimbursement Credit -1.241e+00 2.656e-01 -4.674 3.17e-06 ***
NameMortgage Servicing Tax Credit -9.378e-01 2.865e-01 -3.273 0.001084 **
NameNew York Youth Jobs Program Tax Credit -1.330e+00 2.478e-01 -5.367 9.01e-08 ***
NameQETC Capital Tax Credit 4.561e-01 3.661e-01 1.246 0.212956
NameQETC Employment Credit -5.888e-01 2.709e-01 -2.174 0.029855 *
NameQETC Facilities, Operations, and Training Credit 5.799e-01 5.065e-01 1.145 0.252462
NameReal Property Tax Relief Credit for Manufacturing -1.518e+00 2.550e-01 -5.956 3.09e-09 ***
NameSpecial Additional Mortgage Recording Tax Credit -2.309e-01 2.441e-01 -0.946 0.344374
NameSTART-UP NY Tax Elimination Credit -2.193e+00 2.985e-01 -7.345 3.05e-13 ***
Group1,000,000 - 24,999,999 1.090e+00 9.246e-02 11.785 < 2e-16 ***
Group100,000 - 499,999 3.590e-01 9.451e-02 3.798 0.000150 ***
Group100,000,000 - 499,999,999 1.685e+00 1.040e-01 16.202 < 2e-16 ***
Group25,000,000 - 49,999,999 1.325e+00 1.115e-01 11.883 < 2e-16 ***
Group50,000,000 - 99,999,999 1.473e+00 1.118e-01 13.172 < 2e-16 ***
Group500,000 - 999,999 6.032e-01 1.025e-01 5.886 4.67e-09 ***
Group500,000,000 - and over 2.332e+00 1.016e-01 22.953 < 2e-16 ***
GroupZero or Net Loss 9.934e-01 8.817e-02 11.267 < 2e-16 ***
Num -1.533e-03 2.263e-04 -6.775 1.66e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.022 on 1867 degrees of freedom
Multiple R-squared: 0.7368, Adjusted R-squared: 0.7293
F-statistic: 98.61 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
#Industry
sat.model.summary(industry_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.9902, p-value = 5.826e-12
[1] "kurtosis"
[1] 2.513097
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.2888 -0.4697 0.0183 0.4928 3.9068
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.095e+01 1.107e+01 -5.505 4.09e-08 ***
Year 3.440e-02 5.488e-03 6.269 4.30e-10 ***
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 2.665e-01 2.733e-01 0.975 0.329737
NameAlternative Minimum Tax Credit -2.463e+00 2.272e-01 -10.839 < 2e-16 ***
NameBeer Production Credit 6.334e-01 3.381e-01 1.873 0.061126 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 2.198e+00 2.415e-01 9.100 < 2e-16 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.727e+00 3.930e-01 6.939 5.06e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.586e+00 2.581e-01 6.148 9.17e-10 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 9.695e-01 2.570e-01 3.772 0.000166 ***
NameClean Heating Fuel Credit -2.544e+00 2.481e-01 -10.252 < 2e-16 ***
NameConservation Easement Tax Credit -1.123e+00 2.538e-01 -4.423 1.02e-05 ***
NameCredit for Employment of Persons with Disabilities -1.245e+00 2.737e-01 -4.549 5.67e-06 ***
NameCredit for Purchase of an Automated External Defibrillator -1.535e+00 2.411e-01 -6.368 2.29e-10 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -8.171e-02 4.474e-01 -0.183 0.855119
NameEmpire State Apprentice Tax Credit -8.683e-01 5.464e-01 -1.589 0.112125
NameEmpire State Commercial Production Credit 6.235e-01 3.333e-01 1.871 0.061481 .
NameEmpire State Film Post Production Credit 1.439e+00 2.821e-01 5.100 3.66e-07 ***
NameEmpire State Film Production Credit 3.232e+00 2.584e-01 12.509 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 1.012e+00 4.865e-01 2.080 0.037608 *
NameExcelsior Jobs Program Credit 1.643e+00 2.372e-01 6.929 5.42e-12 ***
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.310e+00 2.332e-01 5.617 2.17e-08 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 7.127e-01 2.275e-01 3.133 0.001750 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.707e+00 2.264e-01 7.540 6.61e-14 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 9.777e-01 2.328e-01 4.200 2.77e-05 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 1.609e-01 2.303e-01 0.699 0.484921
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -5.241e-01 2.722e-01 -1.925 0.054320 .
NameFarm Workforce Retention Credit -8.623e-01 2.890e-01 -2.984 0.002876 **
NameFarmers' School Tax Credit -1.158e+00 2.771e-01 -4.179 3.03e-05 ***
NameHire a Veteran Credit -1.040e+00 4.456e-01 -2.335 0.019638 *
NameHistoric Properties Rehabilitation Credit 2.397e+00 2.614e-01 9.172 < 2e-16 ***
NameInvestment Tax Credit 6.981e-01 2.230e-01 3.131 0.001763 **
NameInvestment Tax Credit for the Financial Services Industry 1.560e+00 3.108e-01 5.019 5.59e-07 ***
NameLife Sciences Research & Development Tax Credit 9.057e-01 4.864e-01 1.862 0.062697 .
NameLong-Term Care Insurance Credit -1.867e+00 2.268e-01 -8.231 3.01e-16 ***
NameLow-Income Housing Credit 1.331e+00 2.967e-01 4.487 7.57e-06 ***
NameMinimum Wage Reimbursement Credit -9.161e-01 2.366e-01 -3.872 0.000111 ***
NameMortgage Servicing Tax Credit 8.932e-01 3.489e-01 2.560 0.010534 *
NameNew York Youth Jobs Program Tax Credit 1.263e-01 2.307e-01 0.548 0.584069
NameQETC Capital Tax Credit 1.162e+00 3.168e-01 3.667 0.000251 ***
NameQETC Employment Credit -3.048e-01 2.411e-01 -1.264 0.206249
NameQETC Facilities, Operations, and Training Credit 1.147e+00 3.624e-01 3.165 0.001571 **
NameReal Property Tax Relief Credit for Manufacturing -6.559e-01 2.388e-01 -2.746 0.006077 **
NameSpecial Additional Mortgage Recording Tax Credit 7.505e-01 2.393e-01 3.136 0.001736 **
NameSTART-UP NY Tax Elimination Credit -1.834e+00 2.565e-01 -7.151 1.14e-12 ***
GroupAdministrative and Support and Waste Management and Remediation Services 2.878e-01 1.360e-01 2.116 0.034417 *
GroupAdministrative/Support/Waste Management/Remediation Services 1.675e-01 1.511e-01 1.109 0.267612
GroupAgriculture, Forestry, Fishing and Hunting -1.132e-01 1.258e-01 -0.899 0.368486
GroupArts, Entertainment, and Recreation 5.595e-01 1.251e-01 4.473 8.09e-06 ***
GroupConstruction -4.299e-02 1.172e-01 -0.367 0.713860
GroupEducational Services 2.253e-01 1.592e-01 1.415 0.157160
GroupFinance and Insurance 5.350e-01 1.099e-01 4.870 1.19e-06 ***
GroupHealth Care and Social Assistance -6.777e-02 1.237e-01 -0.548 0.583720
GroupInformation 7.098e-01 1.176e-01 6.037 1.81e-09 ***
GroupManagement of Companies and Enterprises 6.672e-01 1.052e-01 6.340 2.73e-10 ***
GroupManufacturing 4.608e-01 1.095e-01 4.208 2.67e-05 ***
GroupMining 2.515e-01 1.940e-01 1.296 0.194977
GroupMining, Quarrying, and Oil and Gas Extraction 3.869e-01 1.626e-01 2.379 0.017421 *
GroupOther Services (except Public Administration) -1.550e-01 1.197e-01 -1.295 0.195482
GroupProfessional, Scientific, and Technical Services 4.875e-01 1.126e-01 4.331 1.55e-05 ***
GroupReal Estate and Rental and Leasing 1.461e-01 1.104e-01 1.324 0.185596
GroupRetail Trade 3.754e-01 1.100e-01 3.414 0.000650 ***
GroupTransportation and Warehousing 2.125e-01 1.242e-01 1.711 0.087179 .
GroupUtilities 6.972e-01 1.422e-01 4.904 1.00e-06 ***
GroupWholesale Trade 4.367e-01 1.124e-01 3.886 0.000105 ***
Num 4.542e-04 2.307e-04 1.969 0.049082 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8703 on 2411 degrees of freedom
Multiple R-squared: 0.7631, Adjusted R-squared: 0.7569
F-statistic: 121.4 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
avPlots(income.model.bc)
NA
NA
NA
NA
NA
BIC comparison before and after BoxCox transform
BIC(income.model.bc, income.model)
BIC(industry.model.bc, industry.model)
Stepwise Regression on Income_cat_bc (boxcox transformed dataset)
clean_colname <- function(cols) {
return(str_replace_all(cols, "[-'/ ,�&()`]", '_'))
}
#creating dummy variable columns for stepwise
dummy_func <- function (df){
x = model.matrix(Avg.bc ~., df)[, -1]
dummy_bc = as.data.frame(x) %>% mutate(Avg.bc = df$Avg.bc)
colnames(dummy_bc) <- clean_colname(colnames(dummy_bc))
#colnames(dummy_bc) <- str_replace_all(colnames(dummy_bc), "[-'/ ,�&()`]", '_')
return(dummy_bc)
}
Cleaning column names further so stepwise regression doesn’t present any errors
#Income Group Dataset
income.dummy.bc <- dummy_func(income_cleaned_bc)
#colnames(income.dummy.bc)[37] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(income.dummy.bc)
#Industry Group Dataset
industry.dummy.bc <- dummy_func(industry_cleaned_bc)
#colnames(industry.dummy.bc)[35] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(industry.dummy.bc)
Stepwise regression using BIC as the criteria (k = log(n)).
#Creating Stepwise Models
bcs = list(income = income.dummy.bc, industry = industry.dummy.bc)
model.fulls = list(income = lm(Avg.bc ~ ., data = income.dummy.bc), industry = lm(Avg.bc ~ ., data = industry.dummy.bc))
model.emptys = list(income = lm(Avg.bc ~ 1, data = income.dummy.bc), industry = lm(Avg.bc ~ Num, data = industry.dummy.bc))
k = c('income', 'industry')
forwardBIC = list(income = NULL, industry = NULL)
backwardBIC = list(income = NULL, industry = NULL)
for (i in k){
bc = bcs[[i]]
scope = list(lower = formula(model.emptys[[i]]), upper = formula(model.fulls[[i]]))
n_obs = bc %>% dplyr::count() %>% dplyr::first()
forwardBIC[[i]] = step(model.emptys[[i]], scope, direction = "forward", k = log(n_obs))
backwardBIC[[i]] = step(model.fulls[[i]], scope, direction = "backward", k = log(n_obs))
}
Selecting Best Formula per Dataset from Stepwise Regressions
as.data.frame(rbind(c('income', 'forward', bic_func(forwardBIC[['income']])),
c('income', 'backward', bic_func(backwardBIC[['income']])),
c('industry', 'forward', bic_func(forwardBIC[['industry']])),
c('industry', 'backward', bic_func(backwardBIC[['industry']]))
)
) %>% select(dataset = V1, Stepwise = V2, `Adjusted R^2` = V3, `Number of Coefficients` = V4, `Maximum VIF` = V5) #%>% write_csv('Shiny_app/data/stepwiseBIC_results.csv')
[1] "Adjusted R Squared:"
[1] 0.7287541
[1] "Number of Coefficients:"
[1] 41
[1] "VIF Check: "
[1] 1.867767
[1] "*************************"
[1] "Adjusted R Squared:"
[1] 0.7287541
[1] "Number of Coefficients:"
[1] 41
[1] "VIF Check: "
[1] 1.867767
[1] "*************************"
[1] "Adjusted R Squared:"
[1] 0.7499617
[1] "Number of Coefficients:"
[1] 38
[1] "VIF Check: "
[1] 1.572564
[1] "*************************"
[1] "Adjusted R Squared:"
[1] 0.7530275
[1] "Number of Coefficients:"
[1] 44
[1] "VIF Check: "
[1] 2.348512
[1] "*************************"
Best Model Selection from Stepwise
#The Best Models selected for both income and industry were forwardBIC.
#Income Group Dataset
income.best.formula <- backwardBIC[['income']]$call[[2]]
income.best.formula
#Industry Group Dataset
industry.best.formula <- backwardBIC[['industry']]$call[[2]]
industry.best.formula
Splitting data up into test data and training data (test data is for year 2019, training is the rest)
test_train_split <- function(dummy_bc, best.formula) {
# data.test <- dummy_bc %>% filter(Year == 2019)
# data.train <- dummy_bc %>% filter(Year != 2019)
X <- model.matrix(best.formula, data = dummy_bc)[,-1]
y <- as.matrix(dummy_bc %>% select(all.vars(best.formula)[1]))
set.seed(0)
train.i = sample(1:nrow(dummy_bc), 0.8*nrow(dummy_bc), replace = F)
#train
X.train <- X[train.i,]
y.train <- y[train.i,]
#test
X.test <- X[-train.i,]
y.test <- y[-train.i,]
data.train <- as.data.frame(cbind(y.train, X.train))
data.test <- as.data.frame(cbind(y.test, X.test))
colnames(data.train)[1] = all.vars(best.formula)[1]
colnames(data.test)[1] = all.vars(best.formula)[1]
return (list('X.train' = X.train, 'y.train' = y.train, 'X.test' = X.test, 'y.test' = y.test, 'data.train' = data.train, 'data.test' = data.test, 'data' = dummy_bc))
}
Lasso regression for comparison to Forward Stepwise and Function to show metrics (R^2 and MSE) for Regularization (Ridge/Lasso)
regularization_func <- function (data, alpha, name){
X.train <- data[['X.train']]
y.train <- data[['y.train']]
X.test <- data [['X.test']]
y.test <- data[['y.test']]
data.test <- data[['data.test']]
# print(colnames(X.train))
# print(colnames(X.test))
# print(setdiff(colnames(X.train), colnames(X.test)))
#create lambda grid
lambda.grid = 10^seq(10, -10, length = 100)
#create lasso models with lambda.grid
lasso.models = glmnet(X.train, y.train, alpha = alpha, lambda = lambda.grid)
#visualize coefficient shrinkage
# plot(lasso.models, xvar = "lambda", label = TRUE, main = paste("Lasso Regression:", i))
#Cross Validation to find best lambda
set.seed(0)
cv.lasso.models <- cv.glmnet(X.train, y.train, alpha = alpha, lambda = lambda.grid, nfolds = 10)
#visualize cross validation for lambda that minimizes the mean squared error.
plot(cv.lasso.models, main = paste("Alpha:", alpha, "Regression:", name))
#Checking the best lambda
# log(cv.lasso.models$lambda.min)
# best.lambda <- cv.lasso.models$lambda.min
# print(paste(i, ' best.lambda:', best.lambda))
# best lambda with all the variables was found to be 0.0006892612
# best lambda with only the bwdBIC coefficients included was found to be 0.0003053856
#looking at the lasso coefficients for the best.lambda
# best.lambda.coeff <- predict(lasso.models, s = cv.lasso.models$lambda.min, type = "coefficients")
# print('Number of Coefficients:')
# print(dim(best.lambda.coeff)[1])
#fitting a model with the best lambda found to be 0.000689 and using it to make predictions for the testing data.
lasso.best.lambda.train.pred <- predict(lasso.models, s = cv.lasso.models$lambda.min, newx = X.test)
lasso.best.lambda.train.pred
#checking MSE
MSE.lasso <- mean((lasso.best.lambda.train.pred - y.test)^2)
print(paste('Dimensions of the Alpha:', alpha, ' Regression Coefficients for:', name))
print(dim(coef(lasso.models))[1])
p = dim(coef(lasso.models))[1]
return(eval_results(y.test, lasso.best.lambda.train.pred, data.test, p)['Rsquare'])
}
Function for calculating adjusted R squared
# Calculate R squared from true values and predictions
eval_results <- function(true, predicted, df, p) {
n = nrow(df)
adj.RSS <- sum((predicted - true)^2)/(n-p-1)
adj.TSS <- sum((true - mean(true))^2)/(n-1)
adj.R_square <- 1 - adj.RSS / adj.TSS
#RMSE = sqrt(RSS/n)
return(c(Rsquare = adj.R_square))
}
Creates data structure that stores formulas and full,train, and test datasets for final model metric comparison
formulas <- list(income_cleaned = sat.formula,
income_cleaned_bc = sat.formula.bc,
income.data.split.sat = sat.formula.bc,
income.data.split.best = income.best.formula,
industry_cleaned = sat.formula,
industry_cleaned_bc = sat.formula.bc,
industry.data.split.sat = sat.formula.bc,
industry.data.split.best = industry.best.formula)
all.splits <- list(
'income_cleaned' = test_train_split(income_cleaned, formulas[['income_cleaned']]),
'income_cleaned_bc' = test_train_split(income_cleaned_bc, formulas[['income_cleaned_bc']]),
'income.data.split.sat' = test_train_split(income.dummy.bc, formulas[['income.data.split.sat']]),
'income.data.split.best' = test_train_split(income.dummy.bc, formulas[['income.data.split.best']]),
'industry_cleaned' = test_train_split(industry_cleaned, formulas[['industry_cleaned']]),
'industry_cleaned_bc' = test_train_split(industry_cleaned_bc, formulas[['industry_cleaned_bc']]),
'industry.data.split.sat' = test_train_split(industry.dummy.bc, formulas[['industry.data.split.sat']]),
'industry.data.split.best' = test_train_split(industry.dummy.bc, formulas[['industry.data.split.best']])
)
regularization_func(all.splits[['income_cleaned']], 0, 'income_cleaned')
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned"
[1] 54
Rsquare
-0.01019062
Creating models for rsquared comparison
no.reg
$Rsquare
Rsquare Rsquare Rsquare Rsquare Rsquare Rsquare Rsquare Rsquare
-0.005015543 0.675827060 0.675827060 0.685686397 -0.063341734 0.693459302 0.693459302 0.702857839
$RMSE
RMSE RMSE RMSE RMSE RMSE RMSE RMSE RMSE
5.909580e+06 1.031930e+00 1.031930e+00 1.035938e+00 1.129791e+06 8.933339e-01 8.933339e-01 9.007535e-01
Adjusted R squared table for model comparison between Lasso, Ridge and No reg on cleaned, cleaned_bc, data.split.sat and data.split.best
Generating predictions using best model
#Making predictions using best model on our full dataset
forwardBIC.predict.bc <- predict(forwardBIC[['income']], newdata = all.splits[['income.data.split.best']][['data']])
#Converting those target variable predictions from BoxCox transform back to Dollars
predictions.df <- as.data.frame(forwardBIC.predict.bc) %>% mutate(`Predicted Average in Dollars` = (forwardBIC.predict.bc*income.lambda.bc+1)^(1/income.lambda.bc)) %>% tibble::rownames_to_column(var = 'Index')
predictions.df
data.df <- income_cleaned_bc %>% tibble::rownames_to_column(var = 'Index')
data.df
#joining the predicted data with the original dataset
joined.pred.df <- inner_join(predictions.df, data.df, by = 'Index')
joined.pred.df
#Creating Master dataset with True Avg.bc, Predicted Avg.bc, True Average in Dollars and Predicted Average in Dollars for each Year, Credit Name and Income Group entry.
income.joined.true.pred <- joined.pred.df %>% mutate(`True Average in Dollars` = (Avg.bc*income.lambda.bc+1)^(1/income.lambda.bc)) %>% relocate(`True Average in Dollars`, .before = `Predicted Average in Dollars`) %>% relocate(Avg.bc, .before = forwardBIC.predict.bc) %>% select(1:9, `Predicted Avg.bc` = forwardBIC.predict.bc, `True Avg.bc` = Avg.bc)
all.splits[['income.data.split.best']][['data']] %>% select(Num)
income.joined.true.pred %>% filter(Year == 2019, Group == '$1 - $99,999', Name == 'Alcoholic Beverage Production Credit')
Creating key dataframe for changes between user input colnames and dummy dataset colnames
# #Income
# as.data.frame(colnames(income.dummy.bc))
# name_key_df <- income_cleaned %>% select(col = Name) %>% distinct() %>% mutate(dummy_col = paste0('Name', clean_colname(col)))
# name_key_df
#
# group_key_df <- income_cleaned %>% select(col = Group) %>% distinct() %>% mutate(dummy_col = paste0('Group', clean_colname(col)))
#
# income_key_df <- rbind(name_key_df, group_key_df, c('Year', 'Year'), c('Num', 'Num'))
# user_input_df <- income_key_df %>% mutate(value = 0)
# user_input_df
#
# user_input_df_updated <- user_input_df %>% mutate(value = ifelse(col == 'Alcoholic Beverage Production Credit', 1, value), value = ifelse(col == 'Year', 2019, value), value = ifelse(col == 'Num', 3, value), value = ifelse(col == '$1 - $99,999', 1, value)) %>% select(-col) %>% pivot_wider(names_from = dummy_col, values_from = value)
#
# rbind(user_input_df_updated, user_input_df_updated) #%>% mutate(Num = c(3,8))
#
# user_input_df_updated <- user_input_df_updated[rep(1,5),] %>% mutate(Num = c(1,50,100,200,500))
#
# user_input_df_updated %>% select(NameAlcoholic_Beverage_Production_Credit, Year, Num, `Group$1___$99_999`)
# boxcox_to_dollars <- function(x){
# (x*income.lambda.bc+1)^(1/income.lambda.bc)
# }
#
# predict(forwardBIC[['income']], newdata = user_input_df_updated, interval = 'confidence')
#
# user_input_pred <- predict(forwardBIC[['income']], newdata = user_input_df_updated, interval = 'confidence')
#
# user_input_pred_final <- as.data.frame(user_input_pred) %>% mutate(fit_dollars = boxcox_to_dollars(fit), lwr_dollars = boxcox_to_dollars(lwr), upr_dollars = boxcox_to_dollars(upr)) %>% mutate(Num = user_input_df_updated$Num)
#
# user_input_pred_final %>% ggplot(aes(Num, fit_dollars)) + geom_point() + geom_errorbar(aes(ymin = lwr_dollars, ymax = upr_dollars)) + geom_smooth()
#
# user_input_pred_final %>% ggplot(aes(Num, fit_dollars)) + geom_ribbon(aes(ymin = lwr_dollars, ymax = upr_dollars), fill = 'grey70') + geom_point() + geom_errorbar(aes(ymin = lwr_dollars, ymax = upr_dollars)) + geom_line()
# #Industry
# as.data.frame(colnames(industry.dummy.bc))
#
# industry_name_key_df <- income_cleaned %>% select(col = Name) %>% distinct() %>% mutate(dummy_col = paste0('Name', clean_colname(col)))
# industry_name_key_df
#
# group_key_df <- income_cleaned %>% select(col = Group) %>% distinct() %>% mutate(dummy_col = paste0('Group', clean_colname(col)))
#
# income_key_df <- rbind(name_key_df, group_key_df, c('Year', 'Year'), c('Num', 'Num'))
For loop to create Master dataset with Industry and Income for true and predicted values of Avg.bc and Avg in Dollars
#For loop initialization list
joined.true.pred <- list('income' = NULL, 'industry' = NULL)
for (i in c('income', 'industry')) {
data.index <- paste0(i, '.data.split.best')
m <- lm(formulas[[data.index]], data = all.splits[[data.index]][['data']])
#Making predictions using best model on our full dataset
m.predict <- predict(m, newdata = all.splits[[data.index]][['data']])
#Converting those target variable predictions from BoxCox transform back to Dollars
lambda.bc <- lambda.bcs[[i]]
predictions.df <- as.data.frame(m.predict) %>%
mutate(`Predicted Average in Dollars` = (m.predict*lambda.bc+1)^(1/lambda.bc)) %>%
tibble::rownames_to_column(var = 'Index')
#Loading true values for join with predictions
true.df <- all.splits[[paste0(i, '_cleaned_bc')]][['data']] %>% tibble::rownames_to_column(var = 'Index')
#Joining the predicted data with the true dataset
joined.true.pred[[i]] <- inner_join(predictions.df, true.df, by = 'Index')
#Creating Master dataset with True Avg.bc, Predicted Avg.bc, True Average in Dollars and Predicted Average in Dollars for each Year,
#Credit Name and Income Group entry.
joined.true.pred[[i]] <- joined.true.pred[[i]] %>%
mutate(`True Average in Dollars` = (Avg.bc*lambda.bc+1)^(1/lambda.bc)) %>%
relocate(`True Average in Dollars`, .before = `Predicted Average in Dollars`) %>%
relocate(Avg.bc, .before = m.predict) %>%
select(1:9,
`Predicted Avg.bc` = m.predict,
`True Avg.bc` = Avg.bc)
}
joined.true.pred[['income']]
joined.true.pred[['industry']]
combined.true.pred <- rbind(joined.true.pred[['income']] %>% mutate(dataset = 'income'), joined.true.pred[['industry']] %>% mutate(dataset = 'industry'))
combined.true.pred
all.splits[['income.data.split.best']][['data']]
Writing combined predicted and true values dataset to a csv file
combined.true.pred #%>% write_csv('Shiny_app/data/combined.true.pred.csv')
Visualizations of predictions and true values
income.joined.true.pred %>% filter(Name == 'Investment Tax Credit', Group == '500,000,000 - and over') %>% ggplot(aes(Year, `True Average in Dollars`)) + geom_col()
income_cleaned_bc %>% filter(Name == 'Investment Tax Credit') %>% group_by(Group, Name) %>% summarise(`Average Number of Taxpayers` = mean(Num))
income_cleaned_bc %>% filter(Name == 'Investment Tax Credit') #%>% mutate(Year = Year + 10)
#True vs Predicted values scatterplot with regression line fitted
income.joined.true.pred %>% ggplot(aes(x = `True Avg.bc`, y = `Predicted Avg.bc`)) + geom_point() + geom_smooth(method = 'lm')
joined.true.pred %>% ggplot(aes(x = Avg.bc, y = log(`Predicted Average in Dollars`))) + geom_point()
#joined.pred.df %>% ggplot(aes(x = `Average in Dollars`)) + geom_density(aes(color = Year))
Number of taxpayers per Group for a given Credit Name and Year input (Inputs in the filter)
income_cleaned %>% filter(Year > 2017, Name == 'Investment Tax Credit', Group == 'Zero or Net Loss') %>% group_by(Name, Year, Group) %>% summarise(Num = sum(Num)) %>% arrange(desc(Num))
`summarise()` has grouped output by 'Name', 'Year'. You can override using the `.groups` argument.
Saving best model and best formula to RDS files
best.saved.model[['income']]
Call:
lm(formula = Avg.bc ~ Year + NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit +
NameAlternative_Minimum_Tax_Credit + NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15 +
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15 +
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08 +
NameClean_Heating_Fuel_Credit + NameConservation_Easement_Tax_Credit +
NameCredit_for_Employment_of_Persons_with_Disabilities +
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator +
NameEmpire_State_Apprentice_Tax_Credit + NameEmpire_State_Film_Post_Production_Credit +
NameEmpire_State_Film_Production_Credit + NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit +
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes +
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit + NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners +
NameFarm_Workforce_Retention_Credit + NameFarmers__School_Tax_Credit +
NameHire_a_Veteran_Credit + NameHistoric_Properties_Rehabilitation_Credit +
NameIndustrial_or_Manufacturing_Business_Tax_Credit + NameLong_Term_Care_Insurance_Credit +
NameLow_Income_Housing_Credit + NameMinimum_Wage_Reimbursement_Credit +
NameMortgage_Servicing_Tax_Credit + NameNew_York_Youth_Jobs_Program_Tax_Credit +
NameQETC_Employment_Credit + NameReal_Property_Tax_Relief_Credit_for_Manufacturing +
NameSpecial_Additional_Mortgage_Recording_Tax_Credit + NameSTART_UP_NY_Tax_Elimination_Credit +
Group1_000_000___24_999_999 + Group100_000___499_999 + Group100_000_000___499_999_999 +
Group25_000_000___49_999_999 + Group50_000_000___99_999_999 +
Group500_000___999_999 + Group500_000_000___and_over + GroupZero_or_Net_Loss +
Num, data = income.dummy.bc)
Coefficients:
(Intercept)
-40.021611
Year
0.024793
NameAlternative_Fuels_and_Electric_Vehicle_Recharging_Property_Credit
-1.055822
NameAlternative_Minimum_Tax_Credit
-2.199400
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_6_23_08_but_before_7_1_15
1.527319
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___On_or_after_7_1_15
2.482387
NameBrownfield_Tax_Credits___Redevelopment_Tax_Credit___Prior_to_6_23_08
1.143966
NameClean_Heating_Fuel_Credit
-3.261778
NameConservation_Easement_Tax_Credit
-2.323333
NameCredit_for_Employment_of_Persons_with_Disabilities
-3.302264
NameCredit_for_Purchase_of_an_Automated_External_Defibrillator
-3.122886
NameEmpire_State_Apprentice_Tax_Credit
-2.401088
NameEmpire_State_Film_Post_Production_Credit
0.912991
NameEmpire_State_Film_Production_Credit
2.667659
NameEZ_QEZE_Tax_Credits___EZ_Investment_Tax_Credit
0.714694
NameEZ_QEZE_Tax_Credits___QEZE_Credit_for_Real_Property_Taxes
1.037080
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit
-1.032045
NameEZ_QEZE_Tax_Credits___QEZE_Tax_Reduction_Credit_For_Corporate_Partners
-1.542525
NameFarm_Workforce_Retention_Credit
-1.801838
NameFarmers__School_Tax_Credit
-1.495444
NameHire_a_Veteran_Credit
-3.105583
NameHistoric_Properties_Rehabilitation_Credit
1.729573
NameIndustrial_or_Manufacturing_Business_Tax_Credit
-1.872504
NameLong_Term_Care_Insurance_Credit
-3.047331
NameLow_Income_Housing_Credit
-1.096309
NameMinimum_Wage_Reimbursement_Credit
-1.427066
NameMortgage_Servicing_Tax_Credit
-1.128740
NameNew_York_Youth_Jobs_Program_Tax_Credit
-1.515478
NameQETC_Employment_Credit
-0.775319
NameReal_Property_Tax_Relief_Credit_for_Manufacturing
-1.703769
NameSpecial_Additional_Mortgage_Recording_Tax_Credit
-0.413975
NameSTART_UP_NY_Tax_Elimination_Credit
-2.381861
Group1_000_000___24_999_999
1.103649
Group100_000___499_999
0.360584
Group100_000_000___499_999_999
1.693995
Group25_000_000___49_999_999
1.327502
Group50_000_000___99_999_999
1.476808
Group500_000___999_999
0.607645
Group500_000_000___and_over
2.336912
GroupZero_or_Net_Loss
1.012104
Num
-0.001612